# Prepare NBER MORG data
# Depends on cps_functions.R
# all directories are relative to the root of the project

# Session-wide options: set "mc.cores" to 2.
# NOTE(review): nothing in this script reads the option directly; presumably
# it is picked up by parallel code in the sourced helpers -- confirm.
options(mc.cores = 2)

library(data.table)
library(dplyr)
## library(Hmisc)
library(ggplot2)
## library(ggsci)
library(haven)
library(EnvStats)

# Load the helper functions this script depends on (see header).
source(file = "./src/r/cps_functions.R")

# Directory holding the raw MORG files, relative to the project root.
# NOTE(review): not referenced again in this script; presumably read as a
# global by the sourced helpers -- confirm before renaming or removing.
data.dir <- "./data/raw/morg/"

## Occupation classification converters
# Stata crosswalk files, one per source coding scheme (judging by the file
# names, each maps a census occupation vintage to occ1990dd).
# NOTE(review): none of these tables is used directly below; presumably they
# are read as globals by the sourced helpers -- confirm.
occ1970.occ1990dd <- read_dta(file = "./data/raw/daviddorn/occ1970_occ1990dd.dta")
occ1980.occ1990dd <- read_dta(file = "./data/raw/daviddorn/occ1980_occ1990dd.dta")
occ1990.occ1990dd <- read_dta(file = "./data/raw/daviddorn/occ1990_occ1990dd.dta")
occ2000.occ1990dd <- read_dta(file = "./data/raw/daviddorn/occ2000_occ1990dd.dta")

# CSV crosswalks: read, then force the code columns to numeric.
occ2012.occ1990 <- fread("./data/generated/crosswalks/occ2012_occ1990.csv") %>%
  mutate(
    occ = as.numeric(occ),
    occ1990 = as.numeric(occ1990)
  )

occ00.occ1990 <- fread("./data/generated/crosswalks/occ00_occ1990.csv") %>%
  mutate(
    occ = as.numeric(occ),
    occ1990 = as.numeric(occ1990)
  )

# Recode table keyed by occ1990dd; the key column is forced to numeric as well.
occ1990dd.2digit <- fread("./data/raw/AcemogluAutor2011/census-prep-files/occ1990dd-recode.csv") %>%
  mutate(occ1990dd = as.numeric(occ1990dd))


# Consumer price index obtained from https://fred.stlouisfed.org/series/DPCERG3A086NBEA
cpi <- fread("./data/raw/fred/DPCERG3A086NBEA.csv")
cpi <- cpi %>% rename(date = DATE, cpi = DPCERG3A086NBEA)
# The year is taken from the first four characters of the date column.
cpi$year <- as.numeric(substring(cpi$date, 1, 4))

# Index value of the base year 2016. The deflator below needs exactly one
# observation for that year, so fail early with a clear message otherwise
# (instead of a size error inside mutate()).
base16 <- cpi$cpi[cpi$year == 2016]
if (length(base16) != 1 || is.na(base16)) {
  stop("Expected exactly one non-missing CPI observation for 2016 in ",
       "./data/raw/fred/DPCERG3A086NBEA.csv", call. = FALSE)
}

# cpi16: factor that converts dollars of a given year into 2016 dollars.
cpi <- cpi %>% mutate(cpi16 = base16 / cpi)

# Minimum wage from 1982 transformed to 2016 dollars
minwage82 <- 3.35
minwage16 <- minwage82 * cpi$cpi16[cpi$year == 1982]
# minwage16 is used as a scalar threshold later on; guard against a missing
# or duplicated 1982 observation.
if (length(minwage16) != 1 || is.na(minwage16)) {
  stop("Expected exactly one non-missing CPI observation for 1982 in ",
       "./data/raw/fred/DPCERG3A086NBEA.csv", call. = FALSE)
}

# Arguments handed to read.transform.data() (defined in the sourced helpers).
# NOTE(review): the meaning of drop.low / drop.high is defined by that helper
# and is not visible here -- confirm before changing the values.
varlist <- c(
  "year", "earnwt", "hrwage16", "hrwage16_pareto", "hrwage16_original",
  "cat3", "earnwke16", "uhourse", "intmonth", "gradeat", "gradecp",
  "ihigrdc", "grade92", "Recode", "Recode_desc", "occ1990dd",
  "age", "race", "sex", "lfsr89", "earnwke"
)
years <- 1990
drop.low <- 0
drop.high <- 0
months <- 1:12

# Result is indexed by year below (morg.all[[year]]).
morg.all <- read.transform.data(years, varlist, drop.low, drop.high, months)

# stack data
# Combine the per-year tables in morg.all into one table, in the order given
# by `years`. With a single year the table is used as is.
data.stacked <- morg.all[[years[[1]]]]
if (length(years) > 1) {
  # Diagnostics: print each additional year and its column count.
  for (year in years[-1]) {
    print(year)
    print(ncol(morg.all[[year]]))
  }
  # Bind all years in one call instead of growing data.stacked inside the
  # loop, which re-copies the accumulated rows on every iteration.
  # lapply() yields an unnamed list, so list names cannot leak into row names.
  data.stacked <- do.call(rbind, lapply(years, function(y) morg.all[[y]]))
}

# Indicator columns for the three occupation groups in cat3:
# ind1 = svc, ind2 = rt, ind3 = abs (1 if the row is in the group, else 0;
# NA where cat3 is NA, since the comparison itself is NA).
data.stacked <- data.stacked %>%
  mutate(
    ind1 = ifelse(cat3 == "svc", 1, 0),
    ind2 = ifelse(cat3 == "rt", 1, 0),
    ind3 = ifelse(cat3 == "abs", 1, 0)
  )

# Natural logs of the two hourly-wage columns.
data.stacked <- data.stacked %>%
  mutate(ln_w = log(hrwage16)) %>%
  mutate(ln_w_pareto = log(hrwage16_pareto))

# Drop police, firefighters and other law enforcement (occ1990dd codes 417,
# 418, 423), then flag the wage sample:
# wagesample = 1 when usual hours are at least 35, the hourly wage is at least
# half of the 2016-dollar minimum wage, and both earnwke and cat3 are present.
# The flag is NA when uhourse or hrwage16 is NA and the other conditions hold.
data.stacked.dropped <- data.stacked %>%
  filter(!(occ1990dd %in% c(417, 418, 423))) %>%
  mutate(
    wagesample = ifelse(
      uhourse >= 35 &
        hrwage16 >= 0.5 * minwage16 &
        !is.na(earnwke) &
        !is.na(cat3),
      1,
      0
    )
  )

# Write the result to disk for computing summary statistics.
fwrite(
  x = data.stacked.dropped,
  file = "./data/generated/r/morg_data_stacked_90.csv"
)
